For every pair of features that clearly shows a dependency, think about whether to remove one of the two or whether to design a newer, cleaner single feature out of both.
Detecting redundant features
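Before the visual demonstration below, here is a minimal sketch of turning that idea into a simple correlation filter: within every highly correlated feature pair, one feature is dropped. The 0.9 threshold and the toy data are illustrative assumptions, not from the book.
# Minimal sketch: drop one feature of every highly correlated pair.
# The 0.9 threshold and the toy data are illustrative assumptions.
import numpy as np

def drop_correlated(X, threshold=0.9):
    corr = np.abs(np.corrcoef(X, rowvar=False))  # |r| between all feature pairs
    keep = []
    for j in range(X.shape[1]):
        # keep feature j only if it is not too correlated with one we already kept
        if all(corr[j, k] < threshold for k in keep):
            keep.append(j)
    return X[:, keep], keep

rng = np.random.RandomState(0)
a = rng.normal(size=100)
X = np.c_[a, a + 0.01 * rng.normal(size=100), rng.normal(size=100)]
X_reduced, kept = drop_correlated(X)
print(kept)  # -> [0, 2]: the near-duplicate second column is dropped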
# This code is supporting material for the book
# Building Machine Learning Systems with Python
# by Willi Richert and Luis Pedro Coelho
# published by PACKT Publishing
#
# It is made available under the MIT License
import os
from matplotlib import pylab
import numpy as np
import scipy
from scipy.stats import norm, pearsonr
%matplotlib inline
from utils import CHART_DIR
def _plot_correlation_func(x, y):
    r, p = pearsonr(x, y)
    title = "Cor($X_1$, $X_2$) = %.3f" % r
    pylab.scatter(x, y)
    pylab.title(title)
    pylab.xlabel("$X_1$")
    pylab.ylabel("$X_2$")
    # fit and draw a least-squares line through the points
    f1 = np.poly1d(np.polyfit(x, y, 1))
    pylab.plot(x, f1(x), "r--", linewidth=2)
np.random.seed(0) # to reproduce the data later on
pylab.clf()
pylab.figure(num=None, figsize=(8, 8))
x = np.arange(0, 10, 0.2)
pylab.subplot(221)
y = 0.5 * x + norm.rvs(1, scale=.01, size=len(x))
_plot_correlation_func(x, y)
pylab.subplot(222)
y = 0.5 * x + norm.rvs(1, scale=.1, size=len(x))
_plot_correlation_func(x, y)
pylab.subplot(223)
y = 0.5 * x + norm.rvs(1, scale=1, size=len(x))
_plot_correlation_func(x, y)
pylab.subplot(224)
y = norm.rvs(1, scale=10, size=len(x))
_plot_correlation_func(x, y)
pylab.autoscale(tight=True)
pylab.grid(True)
pylab.clf()
pylab.figure(num=None, figsize=(8, 8))
x = np.arange(-5, 5, 0.2)
pylab.subplot(221)
y = 0.5 * x ** 2 + norm.rvs(1, scale=.01, size=len(x))
_plot_correlation_func(x, y)
pylab.subplot(222)
y = 0.5 * x ** 2 + norm.rvs(1, scale=.1, size=len(x))
_plot_correlation_func(x, y)
pylab.subplot(223)
y = 0.5 * x ** 2 + norm.rvs(1, scale=1, size=len(x))
_plot_correlation_func(x, y)
pylab.subplot(224)
y = 0.5 * x ** 2 + norm.rvs(1, scale=10, size=len(x))
_plot_correlation_func(x, y)
pylab.autoscale(tight=True)
pylab.grid(True)
from IPython.display import Image, display
display(Image('http://2012.igem.org/wiki/images/thumb/7/71/DKL.png/300px-DKL.png', unconfined=False))
Mutual information, which is built on information entropy, is not limited to linear relationships.
As the following shows, it can be applied to quadratic relationships as well.
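To make this concrete before the plots, here is a minimal sketch comparing the Pearson correlation with a histogram-based mutual-information estimate (a simplified, unsmoothed variant of the mutual_info helper defined in the code below):
# Minimal sketch: correlation vs. a crude mutual-information estimate.
# This is a simplified, unsmoothed variant of the mutual_info helper below.
import numpy as np
from scipy.stats import pearsonr

def binned_mutual_info(x, y, bins=10):
    counts_xy, _, _ = np.histogram2d(x, y, bins=bins)
    p_xy = counts_xy / counts_xy.sum()
    p_x = p_xy.sum(axis=1, keepdims=True)   # marginal over x bins
    p_y = p_xy.sum(axis=0, keepdims=True)   # marginal over y bins
    nz = p_xy > 0                           # skip empty bins (avoid log(0))
    return np.sum(p_xy[nz] * np.log2(p_xy[nz] / (p_x @ p_y)[nz]))

x = np.arange(-5, 5, 0.01)
print("linear:    r=%+.3f  MI=%.3f" % (pearsonr(x, 0.5 * x)[0], binned_mutual_info(x, 0.5 * x)))
print("quadratic: r=%+.3f  MI=%.3f" % (pearsonr(x, 0.5 * x ** 2)[0], binned_mutual_info(x, 0.5 * x ** 2)))
# correlation is ~1 for the line but ~0 for the symmetric parabola,
# while the mutual-information estimate is clearly positive for both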
# This code is supporting material for the book
# Building Machine Learning Systems with Python
# by Willi Richert and Luis Pedro Coelho
# published by PACKT Publishing
#
# It is made available under the MIT License
import os
from matplotlib import pylab
import numpy as np
from scipy.stats import norm, entropy
from utils import CHART_DIR
def mutual_info(x, y, bins=10):
    counts_xy, bins_x, bins_y = np.histogram2d(x, y, bins=(bins, bins))
    counts_x, bins_x = np.histogram(x, bins=bins)
    counts_y, bins_y = np.histogram(y, bins=bins)

    # add-one smoothing so that no bin has probability zero
    counts_xy += 1
    counts_x += 1
    counts_y += 1

    P_xy = counts_xy / np.sum(counts_xy, dtype=float)
    P_x = counts_x / np.sum(counts_x, dtype=float)
    P_y = counts_y / np.sum(counts_y, dtype=float)

    # mutual information, normalized by the sum of the marginal entropies
    I_xy = np.sum(P_xy * np.log2(P_xy / (P_x.reshape(-1, 1) * P_y)))
    return I_xy / (entropy(counts_x) + entropy(counts_y))
def _plot_mi_func(x, y):
    mi = mutual_info(x, y)
    title = "NI($X_1$, $X_2$) = %.3f" % mi
    pylab.scatter(x, y)
    pylab.title(title)
    pylab.xlabel("$X_1$")
    pylab.ylabel("$X_2$")
pylab.clf()
pylab.figure(num=None, figsize=(5, 4))
title = "Entropy $H(X)$"
pylab.title(title)
pylab.xlabel("$P(X=$coin will show heads up$)$")
pylab.ylabel("$H(X)$")
pylab.xlim(xmin=0, xmax=1.1)
x = np.arange(0.001, 1, 0.001)
y = -x * np.log2(x) - (1 - x) * np.log2(1 - x)
pylab.plot(x, y)
pylab.autoscale(tight=True)
pylab.grid(True)
np.random.seed(0) # to reproduce the data later on
pylab.clf()
pylab.figure(num=None, figsize=(8, 8))
x = np.arange(0, 10, 0.2)
pylab.subplot(221)
y = 0.5 * x + norm.rvs(1, scale=.01, size=len(x))
_plot_mi_func(x, y)
pylab.subplot(222)
y = 0.5 * x + norm.rvs(1, scale=.1, size=len(x))
_plot_mi_func(x, y)
pylab.subplot(223)
y = 0.5 * x + norm.rvs(1, scale=1, size=len(x))
_plot_mi_func(x, y)
pylab.subplot(224)
y = norm.rvs(1, scale=10, size=len(x))
_plot_mi_func(x, y)
pylab.autoscale(tight=True)
pylab.grid(True)
pylab.clf()
pylab.figure(num=None, figsize=(8, 8))
x = np.arange(-5, 5, 0.2)
pylab.subplot(221)
y = 0.5 * x ** 2 + norm.rvs(1, scale=.01, size=len(x))
_plot_mi_func(x, y)
pylab.subplot(222)
y = 0.5 * x ** 2 + norm.rvs(1, scale=.1, size=len(x))
_plot_mi_func(x, y)
pylab.subplot(223)
y = 0.5 * x ** 2 + norm.rvs(1, scale=1, size=len(x))
_plot_mi_func(x, y)
pylab.subplot(224)
y = 0.5 * x ** 2 + norm.rvs(1, scale=10, size=len(x))
_plot_mi_func(x, y)
pylab.autoscale(tight=True)
pylab.grid(True)
Asking the model about the features
# This code is supporting material for the book
# Building Machine Learning Systems with Python
# by Willi Richert and Luis Pedro Coelho
# published by PACKT Publishing
# It is made available under the MIT License
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification
X, y = make_classification(
    n_samples=100, n_features=10, n_informative=3, random_state=0)
clf = LogisticRegression()
clf.fit(X, y)
print("%s\t/\t%s\t/\t%s" % ("# of features", "selected features (mask)", "feature ranking"))
for i in range(1, 11):
    selector = RFE(clf, n_features_to_select=i)
    selector = selector.fit(X, y)
    print("%i\t%s\t%s" % (i, selector.support_, selector.ranking_))
selector = RFE(clf, n_features_to_select=1)
selector = selector.fit(X, y)
print("Ranking(high priority) : %s" % selector.ranking_)
display(Image('http://freesearch.pe.kr/wp-content/uploads/lasso_fig-1.png', unconfined=False))
# (The original notebook also displayed local figures photo1.png-photo7.png,
#  photo10.png, photo11.png, and photo14.png at this point.)
Although PCA is a linear technique, it usually works well enough in practice.
Unlike LDA, PCA is not sensitive to differences in the training data, because it does not use the class labels.
When you want to reduce the number of features but do not know which feature-extraction technique to use, PCA is usually the first method to try.
Still, no single technique fits every case, so you have to find the method that matches your situation.
Given the original feature space, PCA finds a linear projection into a lower-dimensional space with the following properties: the retained variance is maximized, and the final reconstruction error is minimized.
Because it simply transforms the input data, PCA can be applied to both classification and regression problems.
Here we will use a classification task.
If we ask for N components, the algorithm returns the transformed features in N dimensions.
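Since PCA only transforms the input, it can sit in front of any classifier. The following minimal sketch is not from the book; the synthetic data set and n_components=2 are illustrative assumptions:
# Minimal sketch: PCA as a preprocessing step for classification.
# The synthetic data set and n_components=2 are illustrative choices.
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score

Xd, yd = make_classification(n_samples=200, n_features=10,
                             n_informative=3, random_state=0)
pipe = Pipeline([("pca", PCA(n_components=2)),
                 ("clf", LogisticRegression())])
# cross-validated accuracy using only 2 of the original 10 dimensions
print("%.3f" % cross_val_score(pipe, Xd, yd, cv=5).mean())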
# This code is supporting material for the book
# Building Machine Learning Systems with Python
# by Willi Richert and Luis Pedro Coelho
# published by PACKT Publishing
#
# It is made available under the MIT License
import os
from matplotlib import pylab
import numpy as np
from sklearn import linear_model, decomposition
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
%matplotlib inline
logistic = linear_model.LogisticRegression()
from utils import CHART_DIR
Example 1
np.random.seed(3)
pylab.clf()
fig = pylab.figure(num=None, figsize=(10, 4))
pylab.subplot(121)
title = "Original feature space"
pylab.title(title)
pylab.xlabel("$X_1$")
pylab.ylabel("$X_2$")
x1 = np.arange(0, 10, .2)
x2 = x1 + np.random.normal(scale=1, size=len(x1))
good = (x1 > 5) | (x2 > 5)
bad = ~good
x1g = x1[good]
x2g = x2[good]
pylab.scatter(x1g, x2g, edgecolor="blue", facecolor="blue")
x1b = x1[bad]
x2b = x2[bad]
pylab.scatter(x1b, x2b, edgecolor="red", facecolor="white")
pylab.grid(True)
pylab.subplot(122)
X = np.c_[(x1, x2)]
pca = decomposition.PCA(n_components=1)
Xtrans = pca.fit_transform(X)
Xg = Xtrans[good]
Xb = Xtrans[bad]
pylab.scatter(
    Xg[:, 0], np.zeros(len(Xg)), edgecolor="blue", facecolor="blue")
pylab.scatter(
    Xb[:, 0], np.zeros(len(Xb)), edgecolor="red", facecolor="white")
title = "Transformed feature space"
pylab.title(title)
pylab.xlabel("$X'$")
fig.axes[1].get_yaxis().set_visible(False)
print(pca.explained_variance_ratio_)
pylab.grid(True)
pylab.autoscale(tight=True)
Example 2
Limitation
PCA looks only at the variance of the data and ignores the class labels, so when the classes are separated along a direction of low variance, the projection can mix them together, as the next example shows.
pylab.clf()
fig = pylab.figure(num=None, figsize=(10, 4))
pylab.subplot(121)
title = "Original feature space"
pylab.title(title)
pylab.xlabel("$X_1$")
pylab.ylabel("$X_2$")
x1 = np.arange(0, 10, .2)
x2 = x1 + np.random.normal(scale=1, size=len(x1))
good = x1 > x2
bad = ~good
x1g = x1[good]
x2g = x2[good]
pylab.scatter(x1g, x2g, edgecolor="blue", facecolor="blue")
x1b = x1[bad]
x2b = x2[bad]
pylab.scatter(x1b, x2b, edgecolor="red", facecolor="white")
pylab.grid(True)
pylab.subplot(122)
X = np.c_[(x1, x2)]
pca = decomposition.PCA(n_components=1)
Xtrans = pca.fit_transform(X)
Xg = Xtrans[good]
Xb = Xtrans[bad]
pylab.scatter(
    Xg[:, 0], np.zeros(len(Xg)), edgecolor="blue", facecolor="blue")
pylab.scatter(
    Xb[:, 0], np.zeros(len(Xb)), edgecolor="red", facecolor="white")
title = "Transformed feature space"
pylab.title(title)
pylab.xlabel("$X'$")
fig.axes[1].get_yaxis().set_visible(False)
print(pca.explained_variance_ratio_)
pylab.grid(True)
pylab.autoscale(tight=True)
Addressing the limitation with LDA
Unlike PCA, LDA uses the class labels and looks for the projection that best separates the classes, which handles the case above.
pylab.clf()
fig = pylab.figure(num=None, figsize=(10, 4))
pylab.subplot(121)
title = "Original feature space"
pylab.title(title)
pylab.xlabel("$X_1$")
pylab.ylabel("$X_2$")
good = x1 > x2
bad = ~good
x1g = x1[good]
x2g = x2[good]
pylab.scatter(x1g, x2g, edgecolor="blue", facecolor="blue")
x1b = x1[bad]
x2b = x2[bad]
pylab.scatter(x1b, x2b, edgecolor="red", facecolor="white")
pylab.grid(True)
pylab.subplot(122)
X = np.c_[(x1, x2)]
lda_inst = LinearDiscriminantAnalysis(n_components=1)
Xtrans = lda_inst.fit_transform(X, good)
Xg = Xtrans[good]
Xb = Xtrans[bad]
pylab.scatter(
    Xg[:, 0], np.zeros(len(Xg)), edgecolor="blue", facecolor="blue")
pylab.scatter(
    Xb[:, 0], np.zeros(len(Xb)), edgecolor="red", facecolor="white")
title = "Transformed feature space"
pylab.title(title)
pylab.xlabel("$X'$")
fig.axes[1].get_yaxis().set_visible(False)
pylab.grid(True)
pylab.autoscale(tight=True)
Multidimensional scaling (MDS)

When reducing dimensionality, we want to retain the relative distances between points as much as possible.
This is useful when we want a visual overview of a high-dimensional data set.
We do not care about the data points themselves; instead, we are interested in the pairwise differences between data points and interpret them as distances.
MDS places each data point in a lower-dimensional space such that the new distances resemble the distances in the original space as closely as possible.
Because MDS is frequently used for visualization, the lower dimensionality is chosen to be two or three in most cases.
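To see how well the relative distances survive, one can correlate the pairwise distances before and after the embedding. This is a minimal sketch; using pdist with the Pearson correlation (and random_state=0) is an illustrative choice, not the only way to measure this:
# Minimal sketch: how faithfully does a 2-D MDS embedding of Iris
# preserve the original pairwise distances? (pdist + Pearson r and
# random_state=0 are illustrative choices.)
from scipy.spatial.distance import pdist
from scipy.stats import pearsonr
from sklearn import manifold, datasets

iris = datasets.load_iris()
mds = manifold.MDS(n_components=2, random_state=0)
X2 = mds.fit_transform(iris.data)

d_orig = pdist(iris.data)  # pairwise distances in the original 4-D space
d_new = pdist(X2)          # pairwise distances in the 2-D embedding
print("correlation of distances: %.3f" % pearsonr(d_orig, d_new)[0])
print("stress: %.1f" % mds.stress_)  # the objective MDS minimizes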
# This code is supporting material for the book
# Building Machine Learning Systems with Python
# by Willi Richert and Luis Pedro Coelho
# published by PACKT Publishing
#
# It is made available under the MIT License
import os
import numpy as np
from matplotlib import pylab
from mpl_toolkits.mplot3d import Axes3D
from sklearn import linear_model, manifold, decomposition, datasets
logistic = linear_model.LogisticRegression()
from utils import CHART_DIR
np.random.seed(3)
# all examples will have three classes in this file
colors = ['r', 'g', 'b']
markers = ['o', 6, '*']
X = np.c_[np.ones(5), 2 * np.ones(5), 10 * np.ones(5)].T
y = np.array([0, 1, 2])
fig = pylab.figure(figsize=(10, 4))
ax = fig.add_subplot(121, projection='3d')
ax.set_facecolor('white')
mds = manifold.MDS(n_components=3)
Xtrans = mds.fit_transform(X)
for cl, color, marker in zip(np.unique(y), colors, markers):
    ax.scatter(Xtrans[y == cl][:, 0], Xtrans[y == cl][:, 1],
               Xtrans[y == cl][:, 2], c=color, marker=marker,
               edgecolor='black')
pylab.title("MDS on example data set in 3 dimensions")
ax.view_init(10, -15)
mds = manifold.MDS(n_components=2)
Xtrans = mds.fit_transform(X)
ax = fig.add_subplot(122)
for cl, color, marker in zip(np.unique(y), colors, markers):
    ax.scatter(Xtrans[y == cl][:, 0], Xtrans[y == cl][:, 1],
               c=color, marker=marker, edgecolor='black')
pylab.title("MDS on example data set in 2 dimensions")
iris = datasets.load_iris()
X = iris.data
y = iris.target
# MDS
fig = pylab.figure(figsize=(10, 4))
ax = fig.add_subplot(121, projection='3d')
ax.set_facecolor('white')
mds = manifold.MDS(n_components=3)
Xtrans = mds.fit_transform(X)
for cl, color, marker in zip(np.unique(y), colors, markers):
    ax.scatter(Xtrans[y == cl][:, 0], Xtrans[y == cl][:, 1],
               Xtrans[y == cl][:, 2], c=color, marker=marker,
               edgecolor='black')
pylab.title("MDS on Iris data set in 3 dimensions")
ax.view_init(10, -15)
mds = manifold.MDS(n_components=2)
Xtrans = mds.fit_transform(X)
ax = fig.add_subplot(122)
for cl, color, marker in zip(np.unique(y), colors, markers):
    ax.scatter(Xtrans[y == cl][:, 0], Xtrans[y == cl][:, 1],
               c=color, marker=marker, edgecolor='black')
pylab.title("MDS on Iris data set in 2 dimensions")
# PCA
fig = pylab.figure(figsize=(10, 4))
ax = fig.add_subplot(121, projection='3d')
ax.set_facecolor('white')
pca = decomposition.PCA(n_components=3)
Xtrans = pca.fit(X).transform(X)
for cl, color, marker in zip(np.unique(y), colors, markers):
    ax.scatter(Xtrans[y == cl][:, 0], Xtrans[y == cl][:, 1],
               Xtrans[y == cl][:, 2], c=color, marker=marker,
               edgecolor='black')
pylab.title("PCA on Iris data set in 3 dimensions")
ax.view_init(50, -35)
pca = decomposition.PCA(n_components=2)
Xtrans = pca.fit_transform(X)
ax = fig.add_subplot(122)
for cl, color, marker in zip(np.unique(y), colors, markers):
    ax.scatter(Xtrans[y == cl][:, 0], Xtrans[y == cl][:, 1],
               c=color, marker=marker, edgecolor='black')
pylab.title("PCA on Iris data set in 2 dimensions")
# LDA
from sklearn import linear_model, decomposition
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
pylab.figure(num=None, figsize=(5, 4))
title = "LDA on Iris data set in 1 dimension"
pylab.title(title)
pylab.xlim(-4, 4)
# LDA can produce at most (number of classes - 1) components;
# one dimension is enough for this 1-D scatter plot
lda_inst = LinearDiscriminantAnalysis(n_components=1)
Xtrans = lda_inst.fit_transform(X, y)
for cl, color, marker in zip(np.unique(y), colors, markers):
    pylab.scatter(Xtrans[y == cl][:, 0], np.zeros(len(Xtrans[y == cl])),
                  c=color, marker=marker, edgecolor='black')
pylab.autoscale(tight=True)
pylab.grid(True)